{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "word_segmentation.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "8ZchBxzeH05p"
      },
      "outputs": [],
      "source": [
        "#important library imports\n",
        "import numpy as np\n",
        "import cv2 \n",
        "import os\n",
        "from google.colab.patches import cv2_imshow\n",
        "from scipy.signal import find_peaks, peak_prominences\n",
        "from google.colab import files\n",
        "from IPython.display import clear_output"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#upload your line images in a zipped folder called lines\n",
        "#-o overwrites existing files without prompting (so the cell can be re-run), -q keeps the log quiet\n",
        "!unzip -o -q \"/content/lines.zip\" -d \"/content/\"\n",
        "#clear whatever console output is left after inflating\n",
        "clear_output()"
      ],
      "metadata": {
        "id": "VnkrClVOIoCf"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# list the file names inside the folder called lines\n",
        "# (sorted, because os.listdir returns entries in arbitrary order)\n",
        "filenames=sorted(os.listdir('/content/lines'))\n",
        "# strip only the trailing file extension for further use\n",
        "# (str.replace would also delete a '.tif' that appears inside the name)\n",
        "filenames_split=[os.path.splitext(filename)[0] for filename in filenames]"
      ],
      "metadata": {
        "id": "J2omaiztIy1G"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def directionalHistogram(img, direction='H'):\n",
        "    \"\"\"Count the pixels equal to 255 in every row or every column of a 2-D image.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    img : 2-D array indexed as img[row, column]\n",
        "    direction : 'H' gives one count per row; any other value gives one\n",
        "        count per column\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    list of int, one entry for every row ('H') or every column (otherwise),\n",
        "    including the last one.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    ValueError if img is not 2-D.\n",
        "    \"\"\"\n",
        "    white = np.asarray(img) == 255\n",
        "    if white.ndim != 2:\n",
        "        raise ValueError('directionalHistogram expects a 2-D image')\n",
        "    # summing along the columns (axis=1) leaves one value per row, and vice versa\n",
        "    axis = 1 if direction == 'H' else 0\n",
        "    return white.sum(axis=axis).tolist()\n",
        "\n",
        "def cropLineToWords(viable_sequences, image):\n",
        "  \"\"\"Cut a line image into full-height strips between consecutive cut positions.\n",
        "\n",
        "  viable_sequences: column indices of the cuts, in increasing order\n",
        "  image: 2-D array indexed as image[row, column]\n",
        "\n",
        "  Returns a list with one crop image[:, left:right] for every pair of\n",
        "  neighbouring cut positions. A single cut position yields one crop that runs\n",
        "  from that column to the right edge; no cut positions yield an empty list.\n",
        "  \"\"\"\n",
        "  cuts=list(viable_sequences)\n",
        "  if(len(cuts)==1):\n",
        "    #a lone cut has no partner: keep everything to the right of it\n",
        "    return [image[:, cuts[0]:]]\n",
        "  #slice all rows (the old 0:w-1 slice lost the bottom row of every word)\n",
        "  return [image[:, left:right] for left, right in zip(cuts[:-1], cuts[1:])]\n",
        "\n",
        "def removeSpaces(words):\n",
        "  \"\"\"Return, in order, the 2-D word images that contain at least one pixel greater than zero.\"\"\"\n",
        "  #np.sum over the boolean mask counts the positive pixels; a count of zero marks a blank crop\n",
        "  return [word for word in words if np.sum(word[:,:]>0)]"
      ],
      "metadata": {
        "id": "5AL-IExQJBP2"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#make a directory that will hold the output words\n",
        "#(-p: no error if it already exists; absolute path to match the writes below)\n",
        "!mkdir -p /content/words\n",
        "\n",
        "def _findZeroRuns(hist):\n",
        "  \"\"\"Return the inclusive [start, end] index pairs of every run of consecutive zeros in hist.\"\"\"\n",
        "  zero_sites=np.flatnonzero(np.asarray(hist)==0)\n",
        "  if(zero_sites.size==0):\n",
        "    return []\n",
        "  #a new run begins wherever two neighbouring zero positions are not adjacent\n",
        "  run_breaks=np.flatnonzero(np.diff(zero_sites)!=1)+1\n",
        "  return [[int(run[0]),int(run[-1])] for run in np.split(zero_sites, run_breaks)]\n",
        "\n",
        "#loop over all the lines in your line images folder\n",
        "for line_file, filename in zip(filenames, filenames_split):\n",
        "\n",
        "  path='/content/lines/'+line_file\n",
        "\n",
        "  #read the line image in grey-scale\n",
        "  img=cv2.imread(path, 0)\n",
        "  if(img is None):\n",
        "    #cv2.imread returns None instead of raising when the file is not a readable image\n",
        "    print('skipping unreadable file: '+path)\n",
        "    continue\n",
        "  #img.shape is (rows, columns); only the number of columns is needed here\n",
        "  n_cols=img.shape[1]\n",
        "  #count the white pixels of every column\n",
        "  hist_vertical=directionalHistogram(img, direction='V')\n",
        "  #runs of columns whose count is zero are the candidate gaps (background spaces between words)\n",
        "  sequences=_findZeroRuns(hist_vertical)\n",
        "  sequence_lengths=[end-start+1 for start,end in sequences]\n",
        "\n",
        "  #Threshold the size of the zero sequences (whether it is big enough to consider it as\n",
        "  # an interword spacing or small enough to consider as intraword spacing)\n",
        "  #the first and last run are left out of the sum (presumably the line margins - confirm)\n",
        "  if(len(sequence_lengths)>0):\n",
        "    average_sequence_length=np.sum(sequence_lengths[1:len(sequence_lengths)-1])/len(sequence_lengths)\n",
        "  else:\n",
        "    #no empty column at all: nothing to average, the whole line becomes a single crop\n",
        "    average_sequence_length=0\n",
        "  overlap_factor=0.75*average_sequence_length\n",
        "  viable_sequences_unrolled=[]\n",
        "\n",
        "  #keep the start and end column of every gap that is long enough as cut positions\n",
        "  for i in range(len(sequences)):\n",
        "      if(sequence_lengths[i]>=average_sequence_length-overlap_factor):\n",
        "        viable_sequences_unrolled.append(sequences[i][0])\n",
        "        viable_sequences_unrolled.append(sequences[i][1])\n",
        "\n",
        "  #close the cut list with the right edge of the image (a -1 here would drop the last column)\n",
        "  viable_sequences_unrolled.append(n_cols)\n",
        "  if(viable_sequences_unrolled[0]!=0):\n",
        "        viable_sequences_unrolled=[0]+viable_sequences_unrolled\n",
        "  words=cropLineToWords(viable_sequences_unrolled, img)\n",
        "\n",
        "#save word images to the word directory\n",
        "#crops whose pixels sum to zero (blank black background) are spaces and are skipped\n",
        "  count=0\n",
        "  for word in words:\n",
        "      if(np.sum(word)):\n",
        "        count+=1\n",
        "        cv2.imwrite(\"/content/words/\"+filename+'_word'+str(count)+\".tif\", word)\n",
        "        #write an empty text file next to every word image\n",
        "        with open(\"/content/words/\"+filename+'_word'+str(count)+\".txt\", 'w') as fid: # 'w' creates a new file\n",
        "          fid.write('')\n",
        "\n",
        "#download word directory as a zipped folder\n",
        "!zip -r /content/words.zip /content/words \n",
        "files.download(\"/content/words.zip\") "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 809
        },
        "id": "i71WXm6BJUzB",
        "outputId": "a488cc35-1675-4fac-ee28-e0a90eca3a57"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  adding: content/words/ (stored 0%)\n",
            "  adding: content/words/example_word22.tif (deflated 35%)\n",
            "  adding: content/words/example_word13.tif (deflated 12%)\n",
            "  adding: content/words/example_word3.txt (stored 0%)\n",
            "  adding: content/words/example_word13.txt (stored 0%)\n",
            "  adding: content/words/example_word18.tif (deflated 3%)\n",
            "  adding: content/words/example_word20.txt (stored 0%)\n",
            "  adding: content/words/example_word14.txt (stored 0%)\n",
            "  adding: content/words/example_word8.txt (stored 0%)\n",
            "  adding: content/words/example_word3.tif (deflated 10%)\n",
            "  adding: content/words/example_word4.txt (stored 0%)\n",
            "  adding: content/words/example_word20.tif (deflated 21%)\n",
            "  adding: content/words/example_word7.tif (deflated 10%)\n",
            "  adding: content/words/example_word19.txt (stored 0%)\n",
            "  adding: content/words/example_word7.txt (stored 0%)\n",
            "  adding: content/words/example_word6.tif (deflated 12%)\n",
            "  adding: content/words/example_word11.txt (stored 0%)\n",
            "  adding: content/words/example_word16.txt (stored 0%)\n",
            "  adding: content/words/example_word12.tif (deflated 11%)\n",
            "  adding: content/words/example_word21.txt (stored 0%)\n",
            "  adding: content/words/example_word15.tif (deflated 19%)\n",
            "  adding: content/words/example_word11.tif (deflated 14%)\n",
            "  adding: content/words/example_word1.txt (stored 0%)\n",
            "  adding: content/words/example_word4.tif (deflated 6%)\n",
            "  adding: content/words/example_word9.tif (deflated 12%)\n",
            "  adding: content/words/example_word2.tif (deflated 36%)\n",
            "  adding: content/words/example_word17.tif (deflated 6%)\n",
            "  adding: content/words/example_word12.txt (stored 0%)\n",
            "  adding: content/words/example_word1.tif (deflated 32%)\n",
            "  adding: content/words/example_word14.tif (deflated 13%)\n",
            "  adding: content/words/example_word16.tif (deflated 16%)\n",
            "  adding: content/words/example_word5.txt (stored 0%)\n",
            "  adding: content/words/example_word15.txt (stored 0%)\n",
            "  adding: content/words/example_word8.tif (deflated 8%)\n",
            "  adding: content/words/example_word19.tif (deflated 9%)\n",
            "  adding: content/words/example_word21.tif (deflated 19%)\n",
            "  adding: content/words/example_word6.txt (stored 0%)\n",
            "  adding: content/words/example_word5.tif (deflated 14%)\n",
            "  adding: content/words/example_word18.txt (stored 0%)\n",
            "  adding: content/words/example_word10.txt (stored 0%)\n",
            "  adding: content/words/example_word2.txt (stored 0%)\n",
            "  adding: content/words/example_word9.txt (stored 0%)\n",
            "  adding: content/words/example_word22.txt (stored 0%)\n",
            "  adding: content/words/example_word17.txt (stored 0%)\n",
            "  adding: content/words/example_word10.tif (deflated 16%)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_1f554500-6d8f-48cd-9cd4-a089909b926a\", \"words.zip\", 21900)"
            ]
          },
          "metadata": {}
        }
      ]
    }
  ]
}